{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH\n",
    "sys.path.append('/home/mink/lib/ai4eutils')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "from shutil import copyfile\n",
    "from multiprocessing.pool import ThreadPool\n",
    "from datetime import datetime\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "import path_utils  # ai4eutils\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import process_sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# fws_butler_redrock"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'fws_butler_redrock'\n",
    "\n",
    "container_root = '/mink_disk_0/camtraps/fws-butler/'  \n",
    "\n",
    "path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' \n",
    "path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json' "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0 - Add an entry to the `datasets` table\n",
    "\n",
    "Done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Prepare the `sequence` objects to insert into the database\n",
    "\n",
    "Images are put into species folders at the bottom-most directory. Empty is denoted \"Ghost\". No sequence info recovered.\n",
    "\n",
    "Locations are consistent across the two Redrock folders."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21634"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "21634"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paths = path_utils.recursive_file_list(container_root)\n",
    "len(paths)\n",
    "paths = sorted([p.split(container_root)[1] for p in paths if path_utils.is_image_file(p)])\n",
    "len(paths)\n",
    "# some corrupted images (a few bytes in size) start with a . in the actual name - to filter out next"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 21634/21634 [00:00<00:00, 76917.24it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "21587"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences = []\n",
    "locations = set()\n",
    "\n",
    "for p in tqdm(paths):\n",
    "    basename = os.path.basename(p)\n",
    "    if basename.startswith('.'):\n",
    "        continue\n",
    "    \n",
    "    p_parts = p.split('/')\n",
    "    \n",
    "    assert len(p_parts) == 5 or len(p_parts) == 4, p\n",
    "    \n",
    "    location = p_parts[1]\n",
    "    locations.add(location)\n",
    "    \n",
    "    if len(p_parts) == 5:\n",
    "        species = p_parts[2].lower()\n",
    "        if species == 'ghost':\n",
    "            species = 'empty'\n",
    "    else:\n",
    "        species = '__label_unavailable'\n",
    "\n",
    "    b_parts = basename.split('.')[0].split(' ')\n",
    "    if len(b_parts) != 6:\n",
    "        continue # 4 images says \"Copy\", 1 Unknown and 3 Bighorn\n",
    "        \n",
    "    timestamp = datetime(\n",
    "        year=int(b_parts[0]),\n",
    "        month=int(b_parts[1]),\n",
    "        day=int(b_parts[2]),\n",
    "        hour=int(b_parts[3]),\n",
    "        minute=int(b_parts[4]),\n",
    "        second=int(b_parts[5])\n",
    "    )\n",
    "    \n",
    "    sequences.append({\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': f'dummy_redrock_{location}_{len(sequences)}',\n",
    "        'location': location,\n",
    "        'class': [species],\n",
    "        'images': [{\n",
    "            'file': p,\n",
    "            'frame_num': 1,\n",
    "            'datetime': str(timestamp)\n",
    "        }]\n",
    "    })\n",
    "        \n",
    "len(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "datetime.datetime(2021, 2, 26, 1, 10, 12)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'2021-02-26 01:10:12'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "datetime(year=2021, month=2, day=26, hour=1, minute=10, second=12)\n",
    "str(datetime(year=2021, month=2, day=26, hour=1, minute=10, second=12))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(locations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dataset': 'fws_butler_redrock',\n",
       " 'seq_id': 'dummy_redrock_Solar_21487',\n",
       " 'location': 'Solar',\n",
       " 'class': ['javelina'],\n",
       " 'images': [{'file': 'RedrockPics2/Solar/Javelina/10/2017 12 24 14 47 47.JPG',\n",
       "   'frame_num': 1,\n",
       "   'datetime': '2017-12-24 14:47:47'}]}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences[-100]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Pass the schema check\n",
    "\n",
    "Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.\n",
    "\n",
    "If the format conforms, the following messages will be printed:\n",
    "\n",
    "```\n",
    "Verified that the sequence items meet requirements not captured by the schema.\n",
    "Verified that the sequence items conform to the schema.\n",
    "```\n",
    "\n",
    "For large datasets, the second step will take some time (~ a minute). \n",
    "\n",
    "Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 2.88 s, sys: 428 µs, total: 2.88 s\n",
      "Wall time: 2.88 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output_temp, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences, f, indent=1, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2b - copy images to flat folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def copy_file(src_path, dst_path):\n",
    "    return copyfile(src_path, dst_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 21587/21587 [00:00<00:00, 159790.55it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 111 ms, sys: 28.1 ms, total: 139 ms\n",
      "Wall time: 137 ms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "path_pairs = []\n",
    "for seq in tqdm(sequences):\n",
    "    \n",
    "    if 'empty' in seq['class']:\n",
    "        continue\n",
    "    \n",
    "    seq_id = seq['seq_id']\n",
    "    for im in seq['images']:\n",
    "        src_path = os.path.join(container_root, im['file'])\n",
    "        assert os.path.exists(src_path), src_path\n",
    "        frame = 1\n",
    "        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12c', \n",
    "                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')\n",
    "        path_pairs.append((src_path, dst_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15680"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "('/mink_disk_0/camtraps/fws-butler/RedrockPics/G9/Bighorn/01/2017 09 01 14 15 20.JPG',\n",
       " '/mink_disk_0/camtraps/imerit12c/fws_butler_redrock.seqdummy_redrock_G9_2491.frame1.jpg')"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_pairs)\n",
    "path_pairs[1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 13.6 s, sys: 45.9 s, total: 59.5 s\n",
      "Wall time: 2min 57s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "with ThreadPool(8) as pool:\n",
    "    dst_paths = pool.starmap(copy_file, path_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15680"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dst_paths)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
